package org.zjvis.datascience.common.util;

import cn.hutool.db.Entity;
import com.alibaba.fastjson.JSONObject;
import org.zjvis.datascience.common.constant.DatasetConstant;
import org.zjvis.datascience.common.dto.DatasetDTO;
import org.zjvis.datascience.common.vo.dataset.HeadVO;

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * @description Dataset utilities: date/time text detection, column data-type
 * recommendation and table-name extraction.
 * @date 2021-10-18
 */
public class DatasetUtil {

    /**
     * Plain or scientific-notation number: optional sign, integer part without leading
     * zeros, optional fraction, optional lower-case {@code e} exponent.
     */
    static final String NUMBER_REGEX = "[+-]?([1-9]\\d*|0)(\\.\\d*)?(e[+-]?\\d+(\\.\\d*)?)?";

    /**
     * 18-character identity-card number pattern.
     */
    public static final String ID_REGEX = "([1-6][1-9]|50)\\d{4}(18|19|20)\\d{2}((0[1-9])|10|11|12)(([0-2][1-9])|10|20|30|31)\\d{3}[0-9Xx]";

    /*
     * Date/time shapes the patterns below are meant to recognise:
     *   2021-01-12            (or 2021-1-12)
     *   2001-01-19 03:14:07   (or 2001-1-19 3:14:7)
     *   09/26/2016            (or 9/26/2016)
     *   2020/02/01            (or 2020/2/1)
     *   2020.01.12            (or 2020.1.12)
     *   12:04                 (or 12:4)
     *   02:04:35              (or 2:4:35)
     *   2020年3月1日
     *   8th March,2008        (or 8 March,2008, or abbreviated month "Mar.")
     *   1st Mar., 2008 Sunday (or Sun. 1st Mar., 2008)
     *   March 8th,2008        (or March 8,2008, or abbreviated month "Mar.")
     *   Thursday Dec. 18,2008 (or Dec. 18,2008 Thur.)
     *   Monday                (or Mon.)
     *   1st
     *   January               (or Jan.)
     *   timestamps of 10 digits (seconds) or 13 digits (milliseconds)
     */

    /**
     * Time of day: {@code h:m}, optional {@code :s}, optional {@code .SSS}, optional am/pm marker.
     */
    public static final String TIME_REGEX = "(\\d{1,2}:\\d{1,2}(:\\d{1,2})?(\\.\\d{3})?(\\s(am|pm|AM|PM|a\\.m\\.|p\\.m\\.|A\\.M\\.|P\\.M\\.))?)";

    /**
     * English weekday name, full or abbreviated with a trailing dot.
     */
    public static final String WEEKDAY_REGEX = "(Mon\\.|Tue\\.|Wed\\.|Thur\\.|Fri\\.|Sat\\.|Sun\\.|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)";

    /**
     * English ordinal day of month. The correct forms 11th/12th/13th are accepted; the
     * historical misspellings 11st/12nd/13rd are kept so that input which matched before
     * still matches.
     */
    public static final String DAY_REGEX = "(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|10th|11th|12th|13th|11st|12nd|13rd|14th|15th|16th|17th|18th|19th|20th|21st|22nd|23rd|24th|25th|26th|27th|28th|29th|30th|31st)";

    /**
     * English month name, full or abbreviated with a trailing dot (June/Jun. included).
     */
    public static final String MONTH_REGEX = "(Jan\\.|Feb\\.|Mar\\.|Apr\\.|May\\.|Jun\\.|Jul\\.|Aug\\.|Sept\\.|Oct\\.|Nov\\.|Dec\\.|January|February|March|April|May|June|July|August|September|October|November|December)";

    /**
     * Year-first numeric date, e.g. {@code 2021-01-12}, {@code 2020/2/1}, {@code 2020年3月1日}.
     */
    public static final String NORM_DATETIME = "(\\d{4}[年\\.\\-_/\\\\]\\d{1,2}[月\\.\\-_/\\\\]\\d{1,2}日?)";

    /**
     * Year-last numeric date, e.g. {@code 09/26/2016}.
     */
    public static final String WEST_DATETIME = "(\\d{1,2}[\\.\\-_/\\\\]\\d{1,2}[\\.\\-_/\\\\]\\d{4})";

    /**
     * Numeric timestamp of exactly 10 or exactly 13 digits.
     */
    public static final String TIMESTAMP_REGEX = "\\d{10}|\\d{13}";

    /**
     * Day + month or month + day written with English words, optionally followed by ", yyyy".
     */
    public static final String COMPLEX_DATETIME = String.format("((((%s|\\d{1,2})\\s%s)|(%s\\s(%s|\\d{1,2})))(,\\s?\\d{4})?)", DAY_REGEX, MONTH_REGEX, MONTH_REGEX, DAY_REGEX);

    /**
     * {@link #NORM_DATETIME} with optional leading weekday, trailing time and trailing weekday.
     */
    public static final String NORM_DATETIME_REGEX = String.format("(%s\\s)?%s(\\s%s)?(\\s%s)?", WEEKDAY_REGEX, NORM_DATETIME, TIME_REGEX, WEEKDAY_REGEX);

    /**
     * {@link #WEST_DATETIME} with optional leading weekday, trailing time and trailing weekday.
     */
    public static final String WEST_DATETIME_REGEX = String.format("(%s\\s)?%s(\\s%s)?(\\s%s)?", WEEKDAY_REGEX, WEST_DATETIME, TIME_REGEX, WEEKDAY_REGEX);

    /**
     * {@link #COMPLEX_DATETIME} with optional leading weekday, trailing time and trailing weekday.
     */
    public static final String COMPLEX_DATETIME_REGEX = String.format("(%s\\s)?%s(\\s%s)?(\\s%s)?", WEEKDAY_REGEX, COMPLEX_DATETIME, TIME_REGEX, WEEKDAY_REGEX);

    /**
     * A bare time, weekday or ordinal day. Every part is optional, so this pattern also
     * matches the empty string; {@link #isTime(String)} rejects empty input before using it.
     */
    public static final String OTHER_DATETIME_REGEX = String.format("%s?%s?%s?", TIME_REGEX, WEEKDAY_REGEX, DAY_REGEX);

    // Patterns are compiled once; String.matches would recompile them for every cell.
    // Declared after the regex strings because static initialisers run in textual order.
    private static final Pattern NUMBER_PATTERN = Pattern.compile(NUMBER_REGEX);

    private static final Pattern ID_PATTERN = Pattern.compile(ID_REGEX);

    private static final Pattern[] TIME_PATTERNS = {
            Pattern.compile(MONTH_REGEX),
            Pattern.compile(NORM_DATETIME_REGEX),
            Pattern.compile(WEST_DATETIME_REGEX),
            Pattern.compile(COMPLEX_DATETIME_REGEX),
            Pattern.compile(OTHER_DATETIME_REGEX),
            Pattern.compile(TIMESTAMP_REGEX)
    };

    // Full-width punctuation and its half-width replacement, index by index.
    private static final String[] FULL_WIDTH = {"：", "，", "‘", "’", "”", "“", "【", "】", "\u3000"};

    private static final String[] HALF_WIDTH = {":", ",", "'", "'", "\"", "\"", "[", "]", " "};

    /**
     * Tells whether the text, after {@link #preprocessStr(String)} normalisation, entirely
     * matches one of the supported date/time patterns.
     *
     * @param str text to test; may be {@code null}
     * @return {@code true} when the normalised text is a recognised date/time;
     *         {@code false} for {@code null}, for text that is empty after normalisation,
     *         and for anything that matches none of the patterns
     */
    public static boolean isTime(String str) {
        if (str == null) {
            return false;
        }
        String time = preprocessStr(str);
        if (time.isEmpty()) {
            // OTHER_DATETIME_REGEX consists only of optional groups and would accept "".
            return false;
        }
        for (Pattern pattern : TIME_PATTERNS) {
            if (pattern.matcher(time).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Orders map entries by value, largest value first.
     */
    public static Comparator<Map.Entry<String, Float>> valueComparator = new Comparator<Map.Entry<String, Float>>() {
        @Override
        public int compare(Map.Entry<String, Float> o1,
                           Map.Entry<String, Float> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    };

    /**
     * Normalises text before pattern matching: replaces full-width punctuation with the
     * half-width equivalent, removes one pair of wrapping double quotes, then trims.
     *
     * @param str text to normalise; must not be {@code null}
     * @return the normalised text
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String preprocessStr(String str) {
        String result = str;
        for (int i = 0; i < FULL_WIDTH.length; i++) {
            // Literal replacement; none of these characters needs regex semantics.
            result = result.replace(FULL_WIDTH[i], HALF_WIDTH[i]);
        }
        return stripWrappingQuotes(result).trim();
    }

    /**
     * Removes one leading and one trailing double quote when both are present.
     * A lone {@code "} is returned unchanged: it both starts and ends with a quote,
     * but stripping it would ask for a substring with begin index past the end index.
     */
    private static String stripWrappingQuotes(String text) {
        if (text.length() >= 2 && text.startsWith("\"") && text.endsWith("\"")) {
            return text.substring(1, text.length() - 1);
        }
        return text;
    }

    /**
     * Recommends a data type for every column and stores it on the head as both its
     * type and its recommended type.
     *
     * <p>Each column starts from the head's current type and is refined by scanning the
     * rows in order. Rows that are {@code null}, have no entries, or hold no value for the
     * column are skipped. An empty cell, or a cell that is neither a date/time nor a
     * number, ends the scan with VARCHAR; an array-like or JSON-object cell ends it with
     * ARRAY or JSON.
     *
     * @param heads column heads to update in place
     * @param data  sample rows keyed by column name
     */
    public static void recommendDataType(List<HeadVO> heads, List<Entity> data) {
        for (HeadVO head : heads) {
            String type = inferColumnType(head.getName(), head.getType(), data);
            head.setType(type);
            head.setRecommendType(type);
        }
    }

    /**
     * Scans one column and returns the type the rows suggest, starting from {@code initialType}.
     */
    private static String inferColumnType(String name, String initialType, List<Entity> data) {
        String type = initialType;
        for (Entity entity : data) {
            if (entity == null || entity.size() == 0) {
                continue;
            }
            String cell = entity.getStr(name);
            if (cell == null) {
                continue;
            }
            String row = cell.trim();
            if (row.isEmpty()) {
                return DatasetConstant.DATA_VARCHAR;
            }
            // Collapse doubled quotes, then drop the quotes wrapping the value.
            row = stripWrappingQuotes(row.replace("\"\"", "\""));
            if (row.startsWith("[") && row.endsWith("]")) {
                return DatasetConstant.DATA_ARRAY;
            }
            if (row.equals("{}")) {
                // An empty object carries no type information.
                continue;
            }
            if (row.startsWith("{") && row.endsWith("}") && isJsonObject(row)) {
                return DatasetConstant.DATA_JSON;
            }
            boolean dateCandidate = DatasetConstant.DATA_DATE.equalsIgnoreCase(type)
                    || DatasetConstant.DATA_VARCHAR.equalsIgnoreCase(type);
            if (dateCandidate && isTime(row)) {
                type = DatasetConstant.DATA_DATE;
                continue;
            }
            if (!NUMBER_PATTERN.matcher(row).matches()) {
                return DatasetConstant.DATA_VARCHAR;
            }
            if (row.length() >= DatasetConstant.DATA_NUMBER_MAX_SIZE && !row.contains(".")) {
                // Integer literal at or beyond the configured maximum length: keep as text.
                return DatasetConstant.DATA_VARCHAR;
            }
            if (row.lastIndexOf(".") > 0) {
                type = DatasetConstant.DATA_DECIMAL;
                continue;
            }
            if (DatasetConstant.DATA_DECIMAL.equalsIgnoreCase(type)) {
                // An integer-looking value does not narrow a column already seen as decimal.
                continue;
            }
            if (!DatasetConstant.DATA_INT.equalsIgnoreCase(type) && ID_PATTERN.matcher(row).matches()) {
                // Looks like an identity-card number: keep as text unless the column is already INT.
                type = DatasetConstant.DATA_VARCHAR;
                continue;
            }
            type = DatasetConstant.DATA_INT;
        }
        return type;
    }

    /**
     * Returns whether the text can be parsed as a JSON object.
     */
    private static boolean isJsonObject(String text) {
        try {
            JSONObject.parseObject(text);
            return true;
        } catch (Exception ignored) {
            // Not valid JSON: the caller falls through to the date/number checks.
            return false;
        }
    }

    /**
     * Returns a view of the first {@code previewSize} rows, clamped to the range
     * {@code [0, data.size()]}.
     */
    private static List<Entity> preview(List<Entity> data, int previewSize) {
        int end = Math.max(0, Math.min(previewSize, data.size()));
        return data.subList(0, end);
    }

    /**
     * Optionally recommends column data types, then returns the preview rows.
     *
     * @param heads       column heads; updated in place only when {@code needCheck} is true
     * @param data        sample rows keyed by column name
     * @param previewSize maximum number of rows to return; negative values yield no rows
     * @param needCheck   whether to run type recommendation on {@code heads}
     * @return a sub-list view of the first {@code previewSize} rows of {@code data}
     */
    public static List<Entity> recommendDataType(List<HeadVO> heads, List<Entity> data,
                                                 int previewSize, boolean needCheck) {
        if (needCheck) {
            recommendDataType(heads, data);
        }
        return preview(data, previewSize);
    }

    /**
     * Recommends column data types, then returns the preview rows.
     *
     * @param heads       column heads to update in place
     * @param data        sample rows keyed by column name
     * @param previewSize maximum number of rows to return; negative values yield no rows
     * @return a sub-list view of the first {@code previewSize} rows of {@code data}
     */
    public static List<Entity> recommendDataType(List<HeadVO> heads, List<Entity> data, int previewSize) {
        return recommendDataType(heads, data, previewSize, true);
    }

    /**
     * Builds the {@code schema.table} reference for a dataset from its JSON descriptor:
     * the {@code "schema"} value, a dot, and the {@code "table"} value passed through
     * {@code SqlUtil.formatPGSqlColName}.
     *
     * @param datasetDTO dataset whose {@code dataJson} holds the "schema" and "table" keys
     * @return the table reference string
     */
    public static String extractTableStr(DatasetDTO datasetDTO) {
        JSONObject json = JSONObject.parseObject(datasetDTO.getDataJson());
        return json.getString("schema") + "." + SqlUtil
                .formatPGSqlColName(json.getString("table"));
    }
}
